

* ---------------------------------------------------------------------------
* Run switches: a stage below runs only when its switch equals 1
* ---------------------------------------------------------------------------
* Stage 1: build the applicant name list
global getnames 1
* Stage 2: exact matches and very close matches
global closematch 1
* Stage 3: jarowinkler matching
global fuzzymatch 1

* Relative file names below are resolved against the data directory
cd "$data"



if ${getnames}==1{
* Stage 1: build the applicant name list (voternames.dta) that the later
* stages match against the voter file

* Unique student IDs among the lottery applicants
use "lottery_applicants_voting.dta" , clear
keep sasid
duplicates drop // 0 duplicates
save votersasids.dta, replace
count // 17,297

* Bring in every SIMS name record held for those IDs
merge 1:m sasid using "$lotto\1_id matching\SIMSnameslong2.dta" , keep(master match)
count // 175,715

* Reduce to distinct records that carry a last name
drop year grade school _merge firstletter
duplicates drop
drop if lastname==""
sort sasid
count // 28,761

keep sasid *name* dob gender
duplicates drop
count // 20,783

* Suffix the matching fields with _v; the date of birth ends up as DOB_v
rename (lastname firstname mname) (lastname_v firstname_v mname_v)
rename dob DOB_v

* One record per last name + first name + date of birth
duplicates drop lastname_v firstname_v DOB_v, force

* Date-of-birth components used by the looser match passes
generate birthmonth_v=month(DOB_v)
generate birthday_v=day(DOB_v)
generate birthyear_v=year(DOB_v)

save voternames.dta, replace
}

*EXACT MATCH
if $closematch==1{
* Pass 1: identical last name, first name and date of birth
use voternames.dta, clear

merge 1:m lastname_v firstname_v DOB_v using "$data\ma_voterfile_after1980_clean.dta", keep(master match) nogenerate
count // 18,955

* Store one voter ID per matched student in match1.dta, then return to the
* full merged data for the next pass
preserve
drop if missing(lalvoterid)
keep sasid lalvoterid
duplicates drop sasid, force
count // 9,809
save match1.dta, replace
restore

* Pass 2: exact name plus same birth month and birth year (day may differ)
* Students already matched in pass 1 are removed first
merge m:1 sasid using match1.dta, keep(master) nogenerate //this is unmatched
* Clear out the voter-file columns left behind by the previous merge
drop  lalvoterid-ever_voter
* The master side of a 1:m merge must be unique on the key, so every record
* that shares its key with another record is removed
duplicates tag lastname firstname birthmonth birthyear, generate(dup)
keep if dup==0
drop dup
rename DOB_v DOB_s //rename so can compare the two DOBs
merge 1:m lastname firstname birthmonth birthyear using "$data\ma_voterfile_after1980_clean.dta", keep(master match) nogenerate

* Store one voter ID per newly matched student in match2.dta
preserve
drop if missing(lalvoterid)
keep sasid lalvoterid
duplicates drop sasid, force
count //  78
save match2.dta, replace
restore

* Pass 3: exact name plus same birth month and birth day, with the birth
* year off by exactly one
* Students already matched in pass 2 are removed first
merge m:1 sasid using match2.dta, keep(master) nogenerate
drop  lalvoterid-ever_voter
* Only records that are unique on the merge key can serve as the master side
duplicates tag lastname firstname  birthmonth birthday, generate(dup)
keep if dup==0
drop dup
rename birthyear_v birthyear_s //so can compare
merge 1:m lastname firstname  birthmonth birthday using "$data\ma_voterfile_after1980_clean.dta", keep(master match) nogenerate

* Accept a voter only when the student birth year is one above or one below
* the voter birth year
keep if !missing(lalvoterid) & (birthyear_s==birthyear_v+1 | birthyear_s==birthyear_v-1)

* One voter ID per newly matched student
keep sasid lalvoterid
duplicates drop sasid, force
count // 20

save match3.dta, replace

* Stack the three match files (pass 3 first, then pass 1 and pass 2)
use match3.dta, clear
append using match1.dta match2.dta

* Report whether any student appears more than once
duplicates report sasid

save matched_to_voters.dta, replace

* Whoever is in the name list but not in the stacked matches still needs a match
use voternames.dta, clear
merge m:1 sasid using matched_to_voters.dta, keep(master) nogenerate

save "$data/remaining_charterapps_tomatch.dta", replace


* START WITH REMAINING LIST OF CHARTER APPLICANTS
* Pass 4: match the still-unmatched applicants on last name, first initial,
* middle initial and exact date of birth, store the matches in match4.dta and
* save everyone left over as the V2 remaining file.

use "remaining_charterapps_tomatch.dta", clear
count // 8,020 remaining

*********************** Try for some more close matches ************************

* Match on first and middle initials, last name, and exact DOB
	gen finitial_v = substr(firstname_v, 1,1)
	gen minitial_v = substr(mname_v, 1,1)
	
	* Move the applicant first and middle names to an _s suffix for the duration
	* of the merge (firstname here abbreviates firstname_v); they are renamed
	* back to _v before the V2 file is saved
	ren firstname firstname_s
	ren mname_v mname_s
		
		duplicates tag lastname finitial minitial DOB, gen(dup)
		*drop duplicates so only unique remain
		* NOTE(review): there is no preserve around this drop, so these records
		* are also absent from the V2 file saved at the end of this pass and
		* from every later pass -- confirm that is intended
		drop if dup>=1 // 371
		drop dup 

	*mostly small spelling differences are caught here
	* NOTE(review): the update option presumably exists because the remaining
	* file already carries an all-missing lalvoterid from the earlier
	* keep(1) merge against matched_to_voters.dta -- confirm
	merge 1:m lastname finitial minitial DOB using "$data\ma_voterfile_after1980_clean.dta", keep(1 3 4 5)  nogen update
	
	* Store one voter ID per newly matched student, then return to the merged data
	preserve
	keep sasid lalvoterid
	keep if !missing(lalvoterid)
	keep if !missing(sasid)
	duplicates drop sasid, force
	count // 161
	save match4.dta, replace
	restore

	* Limit to unmatched remaining
	merge m:1 sasid using match4.dta, keep(1) nogen //this is unmatched
	* Positional drop: removes every variable stored from lalvoterid through
	* ever_voter, so the result depends on variable order.
	* NOTE(review): presumably this also removes finitial_v, minitial_v and the
	* voter-file firstname_v and mname_v, which the renames below require -- confirm
	drop  lalvoterid-ever_voter
	
	ren firstname_s firstname_v
	ren mname_s mname_v
		

save "$data/remaining_charterapps_tomatch_V2.dta", replace
	
************************* Merge in nickname crosswalk **************************

	* Read the crosswalk, rename its two columns to the names used in this
	* file, and normalise both: upper case, single internal blanks, no
	* leading or trailing blanks
	import delimited "$raw\VOTER\nickname_crosswalk.csv", varnames(1) clear
	rename (full_name first_name) (firstname_v nickname_v)
	foreach namevar of varlist firstname_v nickname_v {
		replace `namevar'=upper(trim(itrim(`namevar')))
	}
	save "$raw\VOTER\nickname_crosswalk.dta", replace

	* Pair each remaining applicant with every crosswalk row that has the same
	* first name; applicants without a crosswalk row drop out
	use "$data/remaining_charterapps_tomatch_V2.dta", clear
	joinby firstname_v using "$raw\VOTER\nickname_crosswalk.dta", unmatched(none)

* Pass 5: nickname + lastname + DOB

	* Records that share the merge key are removed so the master side is unique
	duplicates tag lastname nickname_v DOB, generate(dup)
	count if dup>=1 // 431
	keep if dup==0
	drop dup

	* Put the nickname in the first-name slot so the merge key lines up with
	* the voter file; the original first name is kept as firstname_nm
	rename firstname_v firstname_nm
	rename nickname_v firstname_v
	label variable firstname_nm "fname - no nickname match"
	label variable firstname_v "nickname-crosswalk"
	merge 1:m lastname firstname_v DOB using "$data\ma_voterfile_after1980_clean.dta", keep(master match match_update match_conflict) nogenerate update

	* Store one voter ID per newly matched student, then return to the merged data
	preserve
	drop if missing(lalvoterid) | missing(sasid)
	keep sasid lalvoterid
	duplicates drop sasid, force
	count // 1
	save match5.dta, replace
	restore

	* Limit to unmatched remaining
	merge m:1 sasid using match5.dta, keep(master) nogenerate //this is unmatched
	drop  lalvoterid-ever_voter

* Pass 6: lastname + nickname + birthmonth + birthyear

	* Again only records that are unique on the merge key are kept
	duplicates tag lastname firstname_v birthmonth birthyear, generate(dup)
	keep if dup==0 // 90 dropped
	drop dup

	merge 1:m lastname firstname_v birthmonth birthyear using "$data\ma_voterfile_after1980_clean.dta", keep(master match match_update match_conflict) nogenerate update

	* One voter ID per newly matched student
	drop if missing(lalvoterid) | missing(sasid)
	keep sasid lalvoterid
	duplicates drop sasid, force
	count // 3
	save match6.dta, replace
	

* Rebuild the pool of still-unmatched applicants from the V2 file.
* The nickname passes ran on a working copy that was expanded by the crosswalk
* join and thinned by duplicate removal, so the pool is reloaded here and the
* students matched in BOTH nickname passes (match5 and match6) are removed.
* Removing only match6 would leave the match5 students in the pool, where a
* later pass could pair them with a second voter ID and make sasid non-unique
* in the combined match file.
use "$data/remaining_charterapps_tomatch_V2.dta", clear
	* Limit to unmatched remaining
	merge m:1 sasid using match5.dta, keep(1) nogen
	merge m:1 sasid using match6.dta, keep(1) nogen // this is unmatched (was 7,459 before match5 was also excluded)
* The keep(1) merges leave an all-missing voter ID column behind; drop any
* voter-file columns so the next pass starts clean
cap drop ever* 
cap drop lal*
save "$data/remaining_charterapps_tomatch_V3.dta", replace


* Voter-file last names with hyphens, apostrophes, periods and blanks removed
use "$data\ma_voterfile_after1980_clean.dta", clear
generate lastname_no = subinstr(subinstr(subinstr(subinstr(lastname_v, "-", "", .), "'", "", .), ".", "", .), " ", "", .)

* Only voters whose last name actually changed are candidates for this pass
drop if lastname_no==lastname_v
tempfile nohyphen
save "`nohyphen'"

* Pass 7: match applicants to the punctuation-stripped voter last names on
* first name and full date of birth (month, year and day)
use "$data/remaining_charterapps_tomatch_V3.dta", clear
* The applicant last name is used unchanged under the stripped-name key
rename lastname_v lastname_no
* Records that share the merge key are removed so the master side is unique
duplicates tag lastname_no firstname_v birthmonth birthyear birthday, generate(dup)
keep if dup==0
drop dup

merge 1:m lastname_no firstname_v birthmonth birthyear birthday using "`nohyphen'", keep(master match match_update match_conflict) nogenerate update

* One voter ID per newly matched student
drop if missing(lalvoterid) | missing(sasid)
keep sasid lalvoterid
duplicates drop sasid, force
count // 332
save match7.dta, replace

* Remaining pool after pass 7: the V3 file minus the students in match7.dta
use "$data/remaining_charterapps_tomatch_V3.dta", clear
merge m:1 sasid using match7.dta, keep(master) nogenerate // this is unmatched (6,981 remaining)
* The merge added an all-missing voter ID column; remove it before saving
drop lalvoterid
save "$data/remaining_charterapps_tomatch_V4.dta", replace

}

if $fuzzymatch==1{

************************* Attempt at jarowinkler strategy **********************
use "$data/remaining_charterapps_tomatch_V4.dta", clear

* Recode gender from its value labels into a 0/1 variable:
* Female becomes 0 and Male becomes 1
tabulate gender
decode gender, generate(male_v)
replace male_v="1" if male_v=="Male"
replace male_v="0" if male_v=="Female"
destring male_v, replace
tabulate male_v
drop gender

* Applicant-side variables get an _s suffix so they stay distinct from the
* voter-file variables, which keep _v
rename (lastname_v firstname_v mname_v DOB_v birthmonth_v birthday_v birthyear_v male_v) ///
       (lastname_s firstname_s mname_s DOB_s birthmonth_s birthday_s birthyear_s male_s)

save "$data/remaining_charterapps_tomatch_V4_jw.dta", replace

* Both files get a constant z so they can be cross-joined

	* Applicant side: add z and remove exact duplicate rows
	use "$data/remaining_charterapps_tomatch_V4_jw.dta", clear
	generate z = 1
	duplicates drop
	save "$data/remaining_charterapps_tomatch_V4_jw_z.dta", replace

	* Voter side: add z and keep birth years 1984 through 1999 only,
	* which reduces the candidates for matching
	use "$data\ma_voterfile_after1980_clean.dta", clear
	generate z = 1
	keep if birthyear>1983 & birthyear<2000
	save "$data\ma_voterfile_after1980_clean_z.dta", replace
	clear all

*jarowinkler matching
* For each remaining applicant, one row at a time: cross-join the applicant
* with the trimmed voter file, keep same-sex voters whose date of birth agrees
* on at least two of year, month and day, score the name similarity, and
* accumulate the surviving candidate pairs in jarowinkler_matches.dta.

/* new paths */
global charterdata = "$data/remaining_charterapps_tomatch_V4_jw_z.dta"
global voterfile = "$data\ma_voterfile_after1980_clean_z.dta"

**how long is the charter data
use "$charterdata", clear
local charterkids = _N

clear


forval i = 1(1)`charterkids' {
di in yellow "`i'"
quietly{
	**load applicant number i only
	use "$charterdata" in `i', clear

	**cross-join on the constant z. The key is named explicitly: without a
	**varlist joinby keys on every variable the two files have in common, so
	**any accidentally shared variable name would silently restrict the join
	joinby z using "$voterfile"

	**sex comparison
	keep if male_s == male_v

	**dob comparisons
	gen yearmatch = (year(DOB_s) == year(DOB_v))
	gen monthmatch = (month(DOB_s) == month(DOB_v))
	gen daymatch = (day(DOB_s) == day(DOB_v))

		**keep some subset of the data, conditional on dob matches
		**eg, keep if more than two agree
		keep if yearmatch + monthmatch + daymatch >= 2

	**name distances are computationally costly
	**so only do this after we limit the data to guys with close matches in dob
	jarowinkler firstname_s firstname_v, gen(fdist)
	jarowinkler lastname_s lastname_v, gen(ldist)
	**flip the sign so that smaller means closer
	replace fdist = 1-fdist
	replace ldist = 1-ldist

	**other possible string comparisons
	**soundex or phonex
	gen fsoundex = (soundex(firstname_s) == soundex(firstname_v))
	gen lsoundex = (soundex(lastname_s) == soundex(lastname_v))

	**we can increase or decrease this after some testing
	keep if fdist <= .3 & ldist <= .3

	**make a dummy score. the weights here are totally wrong...
	**but closer to 0 -> better match
	gen score = 10*fdist + 10*ldist + (1-yearmatch) + (1-monthmatch) + (1-daymatch)

	**rank the candidates for this applicant by score
	egen rank = rank(score), track
	
	**the first applicant starts the output file, later ones are added to it
	if (`i' == 1){
		save "$data/jarowinkler_matches.dta", replace
	}
	else {
		append using "$data/jarowinkler_matches.dta" 
format firstname_v %30s
 format lastname_v %30s
 format mname_v %30s
		save "$data/jarowinkler_matches.dta", replace
	}
}
} 

****************************** Use jarowinkler strategy ****************************

* Accept the top-ranked candidate for a student when its score is at most 1
use "$data/jarowinkler_matches.dta", clear
keep if score <= 1 &rank==1 //only those who have at least a birthday match (for now)
* If a student still has several candidates, keep the lowest-scoring one(s)
cap drop min
bys sasid: egen min = min(score)
keep if min == score // 250

keep sasid lal
duplicates drop
duplicates report sasid 
*1 ties remain, break randomly but reproducibly.
*sample draws from the random-number generator and works on the rows in their
*current order, so both are pinned here; without this every run of the file
*could keep a different voter ID for a tied student.
sort sasid lal
set seed 20170613
sample 1, count by(sasid)
save match8.dta, replace //247


* Combine every match found so far (passes 1-3 plus match4 through match8)
use matched_to_voters.dta, clear
append using match4.dta
append using match5.dta
append using match6.dta
append using match7.dta
append using match8.dta

duplicates drop
* a few matched to same voter id -- check this more later
* For now one student is kept at random per voter ID. The row order and the
* random-number seed are fixed first so that the same student is kept on every
* run; without this the output file could change from run to run.
sort lal sasid
set seed 20170614
sample 1, count by(lal)
cap drop min
save matched_to_voters2.dta, replace



**** KP ADDED -- try some more ranges **** 
* Candidate pairs for students not matched so far (21,298 rows)
use "$data/jarowinkler_matches.dta", clear
merge m:1 sasid using matched_to_voters2.dta, keep(master) nogenerate

* Accept the top-ranked candidate when both dates of birth are identical and
* the score is below 2.
* Eyeball scan of the 632 identical-DOB rows: looks like this catches ones
* where the voter last name had been shortened or misspelled.
keep if rank==1 & score<2 & DOB_s==DOB_v
* If a student still has several candidates, keep the lowest-scoring one(s)
capture drop min
bysort sasid: egen min = min(score)
keep if score==min
count // 178

keep sasid lal
duplicates drop
duplicates report sasid
save match9.dta, replace // 177

* Keep if score<3 & lastname known error
* Candidate pairs for students not matched so far, match9 included (18,031 rows)
use "$data/jarowinkler_matches.dta", clear
foreach matched in matched_to_voters2 match9 {
	merge m:1 sasid using `matched'.dta, keep(master) nogenerate
}

keep if score<3 // 65
* Eyeball scan of these: would accept all -- series of reasonable typos

* Voter last name with hyphens, apostrophes, periods and blanks removed
generate lastname_no = subinstr(subinstr(subinstr(subinstr(lastname_v, "-", "", .), "'", "", .), ".", "", .), " ", "", .)
* Same first name, and the applicant last name equals the stripped voter last name
keep if lastname_s==lastname_no & firstname_v==firstname_s //9

* If a student still has several candidates, keep the lowest-scoring one(s)
capture drop min
bysort sasid: egen min = min(score)
keep if score==min
count // 9

keep sasid lal
duplicates drop
duplicates report sasid
save match10.dta, replace // 7

* Keep if score<2 & 2/3 of birthday matches
* Candidate pairs for students not matched so far, match9 and match10 included (17,468 rows)
use "$data/jarowinkler_matches.dta", clear
foreach matched in matched_to_voters2 match9 match10 {
	merge m:1 sasid using `matched'.dta, keep(master) nogenerate
}

* Top-ranked candidates with score below 2, lowest score per student
keep if rank==1 & score<2
capture drop min
bysort sasid: egen min = min(score)
keep if score==min //62

* Eyeball scan of these: mostly still looking reasonable

* NOTE(review): the matching loop only saved pairs that agree on at least two
* of year, month and day, so this condition keeps the pairs that agree on
* exactly two and drops the pairs that agree on all three. The original remark
* reads "only 2 deleted so all have at least 2/3 of birthday matching", which
* suggests >=2 may have been meant -- confirm before changing.
keep if yearmatch + monthmatch + daymatch <= 2
count // 58

* Remaining conditions, all of which must hold:
*  - middle initials agree, or one of the middle names is empty (20 left)
*  - first name or last name is an exact match; these are more dubious, some
*    are probably matches and some are not
*  - birth years at most one apart (18 left)
*  - same birth month (15 left)
keep if (substr(mname_s, 1,1)==substr(mname_v, 1,1) | mname_v=="" | mname_s=="") ///
	& (firstname_v==firstname_s | lastname_v==lastname_s) ///
	& abs(birthyear_s-birthyear_v)<=1 ///
	& birthmonth_s==birthmonth_v

keep sasid lal
duplicates drop

save match11.dta, replace // 15


* Check to see if anything left worth keeping, seems like not so much
use "$data/jarowinkler_matches.dta", clear
foreach matched in matched_to_voters2 match9 match10 match11 {
	merge m:1 sasid using `matched'.dta, keep(master) nogenerate
}

count if score<4 // 1,307
* Eyeball scan: probably some left at this point that are matches, but they
* could just as easily be different people. Could play with ranges of
* ldist/fdist some more if you think it is worth it.

* Save everything matched to voters at this point
use matched_to_voters2.dta, clear
append using match9.dta match10.dta match11.dta

duplicates drop // 0 obs dropped
save matched_to_voters3.dta, replace

}
